library(readr)
library(tidyverse)
library(ggplot2)
library(janitor)
library(tidycensus)
library(viridis)
library(tscount)
library(yrbss)
library(zoo)
library(purrr)
import_raw_data = FALSE
if(import_raw_data){
ss = read_csv("../../data_1/school_shootings.csv")
colnames(ss) = ss[1,]
ss = ss[-1,]%>%
clean_names()%>%
select(-c(na,na_2))%>%
filter(reliability_score_1_5!=1)
ss2 = read_csv("../../data_1/ss_pt2.csv")%>%
select(-X1)
colnames(ss2)=colnames(ss)[34:ncol(ss)]
ss[(1426:nrow(ss)),(34:ncol(ss))]=ss2
ss$shooter_age = as.numeric(ss$shooter_age)
write.csv(ss,"ss_data.csv")
}
# ss is csv file with school shooting data (individual incidents)
ss = read_csv("ss_data.csv")%>%
select(-X1)%>%
mutate(year = as.numeric(substr(date, (nchar(date)-3),nchar(date))))%>%
mutate(state = ifelse(state == "St. Croix, US Virgin Islands","St.Croix",state))%>%
distinct_at(.vars = c(1:24), .keep_all = T)%>%
filter(category != "Accident")
# data set to go from state names to abbreviations
states = data.frame(state.name,state.abb)
dc = data.frame(matrix(ncol= 2, nrow = 1))
colnames(dc)=colnames(states)
dc$`state.name` = "District of Columbia"
dc$`state.abb` = "DC"
states = rbind(states,dc)
states$state.name = as.character(states$state.name)
if(import_raw_data){
# 1980-1990
pop8090_raw = read.delim("https://www2.census.gov/programs-surveys/popest/tables/1980-1990/state/asrh/st8090ts.txt" )%>%data.frame()
pop8090_raw = pop8090_raw[c(8:58,64:114),]%>%data.frame()
colnames(pop8090_raw)="var"
pop8090 = data.frame(pop8090_raw$var[1:51],pop8090_raw$var[52:102])
colnames(pop8090)=c("var1","var2")
pop8090 = pop8090%>%
separate(var1, c("state",as.character(seq(1980,1984,by = 1))), extra = "merge")%>%
separate(var2, c("state1",as.character(seq(1985,1990,by = 1))), extra = "merge")%>%
select(c(state,starts_with("19")))
# 1970 - 1980
pop7080_raw = read.delim("https://www2.census.gov/programs-surveys/popest/tables/1980-1990/state/asrh/st7080ts.txt")
pop7080_raw = pop7080_raw[c(10:61,63:114),]%>%data.frame()
colnames(pop7080_raw)="var"
pop7080 = data.frame(pop7080_raw$var[1:51],pop7080_raw$var[53:103])
colnames(pop7080)=c("var1","var2")
pop7080 = pop7080%>%
separate(var1, c("blnk","id","state","1970","1971","1972","1973","1974","1975"), extra = "merge")%>%
separate(var2, c("blnk1","id1","state1",as.character(seq(1976,1980,by = 1))), extra = "merge")%>%
select(c(state,starts_with("19")))
# 1990-2000
pop90_00 = read_csv("../../data/pop_90-00.csv")%>%
rename(state = X1)%>%
select(c(state,3:13))%>%
filter(state != "USA")
colnames(pop90_00)[2:ncol(pop90_00)]=as.character(seq(1990,2000,by = 1))
pop90_00$state[30:35] = states$state.name[29:34]
pop90_00$state[40:42] = states$state.name[39:41]
pop90_00$state[9] = states$state.name[51]
pop90_00$state[49] = states$state.name[48]
# 2000-2010
pop00_10 = read.csv("https://www2.census.gov/programs-surveys/popest/datasets/2000-2010/intercensal/state/st-est00int-agesex.csv")%>%
clean_names()%>%
filter(sex == 0, age == 999)%>%
select(c(name,starts_with("popestimate")))%>%
rename_at(vars(starts_with("popestimate")),funs(substr(.,start = 12, stop = 15)))%>%
rename(state = name)%>%
mutate(state = as.character(state))%>%
mutate(state = ifelse(state=="District of Columbia",as.character(states$state.name[51]),state))%>%
filter(state != "United States")
# 2010-2018
pop10_18 = read_csv("../../data/pop_2010-2018.csv")
colnames(pop10_18)=pop10_18[1,]
pop10_18 = pop10_18%>%
slice(-1)%>%
rename_at(vars(starts_with("Pop")), funs(substr(.,start = 38,stop = 41)))%>%
rename(state = Geography)%>%
select(c(state,starts_with("20")))%>%
mutate(state = ifelse(state=="District of Columbia",as.character(states$state.name[51]),state))
# 2019
pop2019 = read_csv("../../data/2019pop.csv")%>%
clean_names()%>%
select(c(state,pop))%>%
rename("2019"=pop)
pops = left_join(pop7080,pop8090%>%select(-c("1980")), by = "state")%>%
left_join(states,by = c("state"="state.abb"))%>%
mutate(state = state.name)%>%
select(-state.name)%>%
left_join(pop90_00%>%select(-c("1990")), by = "state")%>%
left_join(pop00_10%>%select(-c("2000")), by = "state")%>%
left_join(pop10_18%>%select(-c("2010")), by = "state")%>%
left_join(pop2019,by = "state")%>%
gather(key = "year", value = "population", c(2:51))
write.csv(pops, "state_populations.csv")
}
# read in population data (include abbreviated and full state name)
pops = read_csv("state_populations.csv")%>%
select(-X1)%>%
left_join(states, by = c("state"="state.name"))
sts = unique(ss$state)
yrs = seq(range(ss$year)[1],range(ss$year)[2],by=1)
st_yr = data.frame(state = rep(sts,length(yrs)), year = sort(rep(yrs,length(sts))))%>%
arrange(state)
## data set aggrgated by state and year
ss_counts = ss%>%
group_by(state,year) %>%
summarize(incident_count = n(),
total_victims = sum(total_injured_killed_victims),
total_fatalities = sum(killed_includes_shooter),
total_wounded = sum(wounded),
avg_victims = mean(total_injured_killed_victims),
avg_fatalities = mean(killed_includes_shooter),
ave_wounded = mean(wounded),
avg_shooter_age = mean(shooter_age),
max_shooter_age = max(shooter_age),
min_shooter_age = min(shooter_age),
avg_reliability = mean(reliability_score_1_5),
target = mean(ifelse(targeted_specific_victim_s=="Y",1,
ifelse(targeted_specific_victim_s=="N",0,NA)),na.rm = T),
random_vict = mean(ifelse(random_victims=="Y",1,
ifelse(random_victims=="N",0,NA)),na.rm = T),
bullied = mean(ifelse(bullied_y_n_n_a=="Y",1,
ifelse(bullied_y_n_n_a=="N",0,NA)),na.rm = T),
domestic_violence = mean(ifelse(domestic_violence_y_n=="Y",1,
ifelse(domestic_violence_y_n=="N",0,NA)),na.rm = T))%>%
mutate(in_ss = TRUE)%>%
full_join(st_yr, by = c("state","year"))%>%
left_join(pops%>%rename(full_state_name = state), by = c("state"="state.abb", "year"))%>%
mutate(incident_count = ifelse(is.na(incident_count),0,incident_count))%>%
mutate(in_ss = ifelse(is.na(in_ss),FALSE,in_ss))
# read in state grade data
if(import_raw_data){
grades18 = read_csv("../../data_1/2018grades.csv")%>%
select(-c(X6,X7))%>%
clean_names()%>%
select(-gun_death_rate_per_100k)%>%
mutate(year = "2018")%>%
rename(grade = x2018_grade)%>%
select(state,grade,year)
grades17 = read_csv("../../data_1/2017grades.csv")%>%
clean_names()%>%
mutate(year = "2017")%>%
rename(grade = x2017_grade)%>%
select(state,grade,year)
grades16 = read_csv("../../data_1/2016grades.csv")%>%
clean_names()%>%
mutate(year = "2016")%>%
rename(grade = x2016_grade)%>%
select(state,grade,year)
grades15 = read_csv("../../data_1/2015grades.csv")%>%
clean_names()%>%
mutate(year = "2015")%>%
rename(grade = x2015_grade)%>%
select(state,grade,year)
# add in missing value
grades15 = rbind(grades15,c("Kansas","F",2015))
grades14 = read_csv("../../data_1/2014grades.csv")%>%
clean_names()%>%
mutate(year = "2014")%>%
rename(grade = x2014_grade)%>%
select(state,grade,year)
grades13 = read_csv("../../data_1/2013grades.csv")%>%
clean_names()%>%
mutate(year = "2013")%>%
rename(grade = x2013_grade)%>%
select(state,grade,year)
grades12 = read_csv("../../data_1/2012grades.csv")%>%
clean_names()%>%
mutate(year = "2012")%>%
rename(grade = x2012_grade)%>%
select(state,grade,year)
# combine all year data
grades = bind_rows(grades18,grades17)%>%
bind_rows(grades16)%>%
bind_rows(grades15)%>%
bind_rows(grades14)%>%
bind_rows(grades13)%>%
bind_rows(grades12)
write.csv(grades,"state_grades.csv")
}
grades = read_csv("state_grades.csv")%>%
select(-X1)
gpa_convert = data.frame(letter_grade = c("A","A-","B+","B","B-",
"C+","C","C-","D+","D","D-","F"),
gpa = c(4,3.7,3.3,3,2.7,2.3,2,1.7,1.3,1,0.7,0))
ss_counts = ss_counts%>%
left_join(grades%>%select(state,grade,year),by = c("full_state_name"="state","year"))%>%
left_join(gpa_convert, by = c("grade"="letter_grade"))
media = read_csv("media_data.csv")%>%
separate(`Year-Month`, into = c("year","month"), sep = "-")%>%
clean_names()%>%
select(-starts_with("x"))%>%
group_by(year) %>%
mutate(yearly_articles = sum(articles_shootings_excluding_firearm_laws_and_regulations))%>%
mutate(yearly_shootings = sum(mass_shooting))%>%
mutate(art_per_inc = yearly_articles/yearly_shootings)%>%
ungroup()%>%
mutate(prev_month_art = c(0,articles_shootings_excluding_firearm_laws_and_regulations[1:227]))%>%
mutate(prev_year_art = c(rep(0,12),yearly_articles[1:216]))%>%
mutate(year = as.numeric(year))
ss_counts = ss_counts%>%
left_join(media%>%select(prev_year_art, year,yearly_articles)%>%distinct(), by = "year")
pov=read.csv("census_poverty_data.csv",stringsAsFactors = FALSE)
pov=filter(pov,State!=" United States"&State!=" United States"& State!="District of Columbia"&State!="United States")
names(pov)[1:2]<-c("state","poverty")
names(pov)[4]<-"year"
skel=expand_grid(sort(unique(pov$state)),min(pov$year):(max(pov$year)+2))
names(skel)<-c("state","year")
sk1=filter(skel,year==2007|year==2008|year==2009)
sk1=left_join(sk1,filter(pov,year==2007),by="state")
sk2=filter(skel,year==2010|year==2011|year==2012)
sk2=left_join(sk2,filter(pov,year==2010),by="state")
sk3=filter(skel,year==2013|year==2014|year==2015)
sk3=left_join(sk3,filter(pov,year==2013),by="state")
sk4=filter(skel,year==2016|year==2017|year==2018)
sk4=left_join(sk4,filter(pov,year==2016),by="state")
pov=rbind(sk1,sk2,sk3,sk4)
pov=pov[,1:3]
names(pov)<-c("state","year","poverty")
pov = left_join(pov,states, by = c("state"="state.name"))%>%
select(-state)%>%
rename(state = state.abb)
ss_counts = left_join(ss_counts, pov, by = c("state","year"))
bully = read_csv("State_bullying.csv")%>%
select(c(total_score,state.abb))%>%
rename(state = state.abb)%>%
mutate(year = 2010)
ss_counts = left_join(ss_counts, bully%>%select(-year), by = "state")%>%
rename(bullying_score = total_score)%>%
mutate(bullying_score = as.numeric(bullying_score))
mh.data=read.csv("mh.data.csv",stringsAsFactors = FALSE)%>%
clean_names()%>%
select(-x)%>%
left_join(states, by = c("state"="state.name"))%>%
select(-state)%>%
rename(state = state.abb)
ss_counts = ss_counts%>%
left_join(mh.data, by = c("state","year"))%>%
rename(mh_score = percent)
## Warning: Column `state` joining character vector and factor, coercing into
## character vector
qns = c("qn13","qn14","qn15","qn16","qn17","qn18","qn19","qn20","qn24","qn25","qn26","qn27","qn28","qn29","qn30","qnbullyweight","qnbullygay")
sts = yrbss::getListOfParticipatingStates()
yrs = c(1991,1993,1995,1997,1999,2001,2003,2005,2007,2009,2011,2013,2015)
if(import_raw_data){
bullying = data.frame(variable = NULL, state = NULL, year = NULL, prop= NULL, ciLB= NULL, ciUB = NULL, n = NULL)
for(v in qns){
for(s in sts){
for(y in yrs ){
p = yrbss::getProportionSingleVariable(.variable = v, .location = s, .year = y, .level = 0.95)$prop
ciL = yrbss::getProportionSingleVariable(.variable = v, .location = s, .year = y, .level = 0.95)$ciLB
ciU = yrbss::getProportionSingleVariable(.variable = v, .location = s, .year = y, .level = 0.95)$ciUB
sample_size = yrbss::getProportionSingleVariable(.variable = v, .location = s, .year = y, .level = 0.95)$n
newrow = data.frame(variable = v, state = s, year = y, prop= p, ciLB= ciL, ciUB = ciU, n = sample_size)
bullying = bind_rows(bullying,newrow)}
}}
lab = data.frame(question = yrbss::yrbss_questions_binary$variable, label = yrbss::yrbss_questions_binary$label)%>%
filter(question %in% qns)
bullying = left_join(bullying,lab, by = c(variable = "question"))
bullying$state[which(bullying$state=="AZB")]="AZ"
write_csv(bullying, "bullying_survey.csv")
}
lab = data.frame(question = yrbss::yrbss_questions_binary$variable, label = yrbss::yrbss_questions_binary$label)%>%
filter(question %in% qns)
# read in csv
bullying_survey = read_csv("bullying_survey.csv")
## Parsed with column specification:
## cols(
## variable = col_character(),
## state = col_character(),
## year = col_double(),
## prop = col_double(),
## ciLB = col_double(),
## ciUB = col_double(),
## n = col_double(),
## label = col_character()
## )
# check missingness for 2009-2015
sum(!complete.cases(filter(bullying_survey, year>=2007)))/nrow(filter(bullying_survey, year>=2007))
## [1] 0.3135135
### We're going to need to impute values for missing years and states for some of these variables if these data are going to be useful in our analysis
# combine suicide questions
suicide = data.frame(year = NULL, state = NULL, score = NULL, n = NULL)
suicide_qns = bullying_survey%>%
filter(variable %in% c("qn26","qn27","qn28","qn29","qn30"))
sts[which(sts=="AZB")]="AZ"
for(y in yrs){
for(st in sts){
dat = filter(suicide_qns,(year ==y & state == st))
suicide_score = weighted.mean(dat$prop,1/(dat$n), na.rm = TRUE)
samp = mean(dat$n,na.rm = TRUE)
newrow = data.frame(year = y, state = st, score = suicide_score, n = samp)
suicide = bind_rows(suicide,newrow)
}
}
# combine questions involving physical fights
fight = data.frame(year = NULL, state = NULL, score = NULL, n = NULL)
fight_qns = bullying_survey%>%
filter(variable %in% c("qn18","qn19","qn20"))
for(y in yrs){
for(st in sts){
dat = filter(fight_qns,(year ==y & state == st))
fight_score = weighted.mean(dat$prop,1/(dat$n), na.rm = TRUE)
samp = mean(dat$n,na.rm = TRUE)
newrow = data.frame(year = y, state = st, score = fight_score, n = samp)
fight = bind_rows(fight,newrow)
}
}
# school safety questions
safety = data.frame(year = NULL, state = NULL, score = NULL, n = NULL)
safety_qns = bullying_survey%>%
filter(variable %in% c("qn13","qn14","qn15","qn16","qn17"))
for(y in yrs){
for(st in sts){
dat = filter(safety_qns,(year ==y & state == st))
safety_score = weighted.mean(dat$prop,1/(dat$n), na.rm = TRUE)
samp = mean(dat$n,na.rm = TRUE)
newrow = data.frame(year = y, state = st, score = safety_score, n = samp)
safety = bind_rows(safety,newrow)
}
}
# bullying questions
bull = data.frame(year = NULL, state = NULL, score = NULL, n = NULL)
bully_qns = bullying_survey%>%
filter(variable %in% c("qn24","qn25","qnbullyweight","qnbullygay"))
for(y in yrs){
for(st in sts){
dat = filter(bully_qns,(year ==y & state == st))
bully_score = weighted.mean(dat$prop,1/(dat$n), na.rm = TRUE)
samp = mean(dat$n,na.rm = TRUE)
newrow = data.frame(year = y, state = st, score = bully_score, n = samp)
bull = bind_rows(bull,newrow)
}
}
survey_combined = bind_cols(suicide,safety, fight, bull)%>%
select(year, state, score, n, score1, n1, score2, n2, score3, n3)%>%
rename(suicide_score = score, school_safety_score = score1, fight_score = score2, bully_score = score3)%>%
filter(year>=2007)%>%
full_join(st_yr%>%filter(year>=2007))%>%
arrange(year)%>%
arrange(state)%>%
mutate(state = ifelse(state == "AZB","AZ",state))
survey_combined = survey_combined%>%
group_by(state)%>%
mutate(suicide_score =ifelse(year<2016,ifelse(is.na(suicide_score), na.locf(suicide_score)/2+na.locf(suicide_score, fromLast = TRUE)/2,suicide_score),
NA ))%>%
mutate(school_safety_score =ifelse(year<2016,ifelse(is.na(school_safety_score), na.locf(school_safety_score)/2+na.locf(school_safety_score, fromLast = TRUE)/2,school_safety_score),NA))%>%
mutate(fight_score =ifelse(year<2016,ifelse(is.na(fight_score), na.locf(fight_score)/2+na.locf(fight_score, fromLast = TRUE)/2,fight_score),NA))%>%
mutate(bully_score =ifelse(year<2016,ifelse(is.na(bully_score), na.locf(bully_score)/2+na.locf(bully_score, fromLast = TRUE)/2,bully_score),NA))%>%
mutate(n=ifelse(year<2016,ifelse(is.na(n), na.locf(n)/2+na.locf(n, fromLast = TRUE)/2,n),NA))%>%
mutate(n1=ifelse(year<2016,ifelse(is.na(n1), na.locf(n1)/2+na.locf(n1, fromLast = TRUE)/2,n1),NA))%>%
mutate(n2=ifelse(year<2016,ifelse(is.na(n2), na.locf(n2)/2+na.locf(n2, fromLast = TRUE)/2,n2),NA))%>%
mutate(n3=ifelse(year<2016,ifelse(is.na(n3), na.locf(n3)/2+na.locf(n3, fromLast = TRUE)/2,n3),NA))
# use fitted line to get 2016-2019 values
for(s in sts[which(!(sts %in% c("MA","AZB")))]){
dat = filter(survey_combined, state ==s)
ind = which(is.na(dat$suicide_score))[1]
slp = dat[ind-1,3:10]-dat[ind-2,3:10]
dat[which(is.na(dat$suicide_score))[1],3:10] = dat[(ind-1),3:10]+as.numeric(dat[which(is.na(dat$suicide_score))[1],1]-2015)*slp
dat[which(is.na(dat$suicide_score))[1],3:10] = dat[(ind-1),3:10]+as.numeric(dat[which(is.na(dat$suicide_score))[1],1]-2015)*slp
dat[which(is.na(dat$suicide_score))[1],3:10] = dat[(ind-1),3:10]+as.numeric(dat[which(is.na(dat$suicide_score))[1],1]-2015)*slp
dat[which(is.na(dat$suicide_score))[1],3:10] = dat[(ind-1),3:10]+as.numeric(dat[which(is.na(dat$suicide_score))[1],1]-2015)*slp
survey_combined[which(survey_combined$state == s),]=dat
}
# chech to see if mean and median are different
survey_combined%>%
group_by(year)%>%
summarise(med_ss = median(suicide_score, na.rm = T), mn_ss = mean(suicide_score, na.rm = T),
med_safe = median(school_safety_score, na.rm = T), mn_safe = mean(school_safety_score, na.rm = T),
med_f = median(fight_score, na.rm = T), mn_f = mean(fight_score, na.rm = T),
med_b = median(bully_score, na.rm = T), mn_b = mean(bully_score, na.rm = T))
## # A tibble: 13 x 9
## year med_ss mn_ss med_safe mn_safe med_f mn_f med_b mn_b
## <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 2007 0.118 0.123 0.0913 0.0885 0.154 0.155 0.195 0.198
## 2 2008 0.119 0.124 0.0880 0.0872 0.152 0.154 0.195 0.198
## 3 2009 0.123 0.124 0.0843 0.0852 0.145 0.153 0.195 0.198
## 4 2010 0.126 0.125 0.0827 0.0851 0.144 0.146 0.186 0.187
## 5 2011 0.121 0.125 0.0836 0.0843 0.137 0.143 0.185 0.187
## 6 2012 0.129 0.132 0.0827 0.0855 0.128 0.136 0.182 0.187
## 7 2013 0.134 0.137 0.0845 0.0893 0.122 0.127 0.183 0.187
## 8 2014 0.140 0.140 0.0847 0.0869 0.109 0.121 0.189 0.185
## 9 2015 0.143 0.142 0.0810 0.0836 0.105 0.113 0.189 0.185
## 10 2016 0.148 0.144 0.0756 0.0803 0.103 0.106 0.189 0.184
## 11 2017 0.148 0.146 0.0738 0.0770 0.100 0.0985 0.189 0.184
## 12 2018 0.148 0.149 0.0715 0.0737 0.0937 0.0910 0.189 0.184
## 13 2019 0.148 0.151 0.0713 0.0704 0.0887 0.0836 0.189 0.184
# impute missing values with median value for each column
survey_combined = survey_combined%>%
ungroup()%>%
group_by(year)%>%
mutate(suicide_score = ifelse(is.na(suicide_score), median(suicide_score, na.rm = T),suicide_score))%>%
mutate(school_safety_score = ifelse(is.na(school_safety_score), median(school_safety_score, na.rm = T),school_safety_score))%>%
mutate(fight_score = ifelse(is.na(fight_score), median(fight_score, na.rm = T),fight_score))%>%
mutate(bully_score = ifelse(is.na(bully_score), median(bully_score, na.rm = T),bully_score))%>%
mutate(n = ifelse(is.na(n), median(n, na.rm = T),n))%>%
mutate(n1 = ifelse(is.na(n1), median(n1, na.rm = T),n1))%>%
mutate(n2 = ifelse(is.na(n2), median(n2, na.rm = T),n2))%>%
mutate(n3 = ifelse(is.na(n3), median(n3, na.rm = T),n3))
ss_counts = left_join(ss_counts, survey_combined, by = c("state","year"))%>%
filter(state != "DC")
rand.count.data = read_csv("cleaned.rand.csv")%>%
select(-X1)%>%
left_join(states, by = c("state"= "state.name"))%>%
select(-state)%>%
rename(state = state.abb)
## Warning: Missing column names filled in: 'X1' [1]
## Parsed with column specification:
## cols(
## .default = col_double(),
## state = col_character()
## )
## See spec(...) for full column specifications.
ss_counts = left_join(ss_counts,rand.count.data, by = c("state","year"))
## Warning: Column `state` joining character vector and factor, coercing into
## character vector
ss_counts = ss_counts%>%
mutate(access_laws = -1*(age_permissive-age_restrictive+child_access_permissive-child_access_restrictive+prohibited_possessor_permissive-prohibited_possessor_restrictive))%>%
mutate(transport= -1*(castle_doctrine_permissive-castle_doctrine_restrictive+open_carry_permissive-open_carry_restrictive+carrying_permissive-carrying_permissive))%>%
mutate(screening = -1*(background_check_permissive- background_check_restrictive + waiting_period_permissive- waiting_period_restrictive + permit_permissive - permit_restrictive + registration_permissive - registration_restrictive + safety_training_permissive - safety_training_restrictive + dealer_license_permissive - dealer_license_restrictive + one_gun_per_permissive - one_gun_per_restrictive))%>%
mutate(ban = -ban_permissive+ban_restrictive)%>%
select(-c(36:63))
table(ss$reliability_score_1_5)%>%
data.frame()%>%
ggplot(aes(x = Var1, y = Freq))+
geom_bar(aes(fill = Var1), stat = "identity")+
scale_fill_discrete(name = "Reliability Score")+
labs(x = "Reliability Score", y = "Count")
table(ss$school_type)%>%
data.frame()%>%
ggplot(aes(x = Var1, y = Freq))+
geom_bar(aes(fill = Var1), stat = "identity")+
scale_fill_discrete(name = "School Type")+
labs(x = "School Type", y = "Count")
ss%>%
filter(shooter_age<200)%>%
ggplot(aes(x = reorder(state, shooter_age, FUN = mean), y = shooter_age))+
geom_boxplot()+
theme(axis.text.x = element_text(angle = 45, hjust = 1, size = 6))+
labs(y = "Age of shooter", x = "State", title = "Age of Shooter by State")
ss%>%
ggplot(aes(x = reorder(state, killed_includes_shooter, FUN = mean), y = killed_includes_shooter))+
geom_boxplot()+
theme(axis.text.x = element_text(angle = 45, hjust = 1, size = 6))+
labs(y = "Numbeer Killed (including shooter)", x = "State", title = "Fatalities")
ss%>%
ggplot(aes(x = reorder(state, wounded, FUN = mean), y = total_injured_killed_victims))+
geom_boxplot()+
theme(axis.text.x = element_text(angle = 45, hjust = 1, size = 6))+
labs(y = "Wounded ", x = "State", title = "Number Wounded")
ss%>%
ggplot(aes(x = reorder(state, total_injured_killed_victims, FUN = mean), y = total_injured_killed_victims))+
geom_boxplot()+
theme(axis.text.x = element_text(angle = 45, hjust = 1, size = 6))+
labs(y = "Wounded or Killed", x = "State", title = "Combined Wounded & Fatalities")
table(ss$state)%>%
data.frame()%>%
ggplot(aes(x = Var1, y = Freq))+
geom_bar(aes(x = reorder(Var1,Freq, desc)), stat = "identity")+
labs(x = "State", y = "Count", "Number of incidents by State (overall)")+
theme(axis.text.x = element_text(angle = 45, hjust = 1, size = 6))
cc = scales::seq_gradient_pal("green", "red", "Lab")(seq(0,1,length.out=12))
cc2 = scales::seq_gradient_pal("green", "red", "Lab")(seq(0,1,length.out=5))
# This will handle the ordering
d <- ss_counts %>%
filter(year %in% seq(2012,2018,by=1))%>%
mutate(grade = substr(grade,1,1))%>%
ungroup() %>% # As a precaution / handle in a separate .grouped_df method
arrange(year, grade) %>% # arrange by facet variables and continuous values
mutate(.r = row_number()) # Add a row number variable
ggplot(d, aes(x = .r, y= incident_count, fill = grade)) + # Use .r instead of x
geom_point(data = d%>%filter(incident_count==0),
aes(x = .r, y= incident_count, color = grade),
size = 0.7) +
geom_bar(stat = "identity")+
facet_wrap(~ year, scales = "free_x") + # Should free scales (though could be left to user)
scale_x_continuous( # This handles replacement of .r for x
breaks = d$.r, # notice need to reuse data frame
labels = d$state
)+
labs(title = "Incident Count by Grade", x = "State", y = "School Shootings")+
scale_fill_manual(values = cc2, name = "Grade")+
scale_color_manual(values = cc2, guide = FALSE)+
theme_bw()+
theme(axis.text.x = element_text(angle = 45, hjust = 1, size = 3.5), legend.position = c(0.7,.15), legend.direction = "horizontal",legend.key.size = unit(1.5,"line"), legend.text = element_text(size = 8), legend.title = element_text(size = 10),plot.title = element_text(hjust = 0.5))
### Outcome plot:
ss_counts%>%
group_by(year)%>%
summarise(ss = sum(incident_count))%>%
ggplot(aes(x = year, y = ss))+
geom_line(color = "red")+
labs(x = "Year", y = "School Shootings", title = "School Shootings in the U.S.")+
theme_bw()+
theme(plot.title = element_text(hjust = 0.5))
ss_counts%>%
filter(year %in% seq(2012,2018,by=1))%>%
filter(!is.na(grade))%>%
ggplot(aes(x = incident_count , fill = grade))+
geom_histogram(binwidth = 1)+
facet_wrap(.~year)+
scale_fill_manual(values = cc)+
scale_x_continuous(breaks = seq(0,35,by=5))+
theme(axis.text.x = element_text(angle = 45, hjust = 1, size = 3.5))+
theme_bw()+
labs(x= "Number of School Shootings")
ss_counts%>%
filter(year %in% seq(2012,2018,by=1))%>%
filter(!is.na(grade))%>%
ggplot(aes(x = grade, y = incident_count, color = grade))+
geom_boxplot()+
facet_wrap(.~year)+
theme_bw()+
scale_color_manual(values = cc)+
labs(y= "Number of School Shootings", x = "Grade")
ss_counts%>%
filter(year %in% seq(2012,2018,by=1))%>%
ggplot()+
geom_bar(aes(x = reorder(state,population), y = incident_count), stat = "identity")+
geom_point(aes(x = state, y = log(population), color = "population"), size = .4)+
theme_bw()+
labs(x = "State", y = "Number of School Shootings", title = "Number of School Shooting and Population by State")+
theme(axis.text.x = element_text(angle = 45, hjust = 1, size = 3.5), legend.position = c(.7, 0), legend.justification = c(1, 0))+
facet_wrap(.~year)+
scale_color_discrete(name = NULL, labels = "Population (log)")
ss_counts%>%
filter(year %in% seq(2012,2018,by=1))%>%
group_by(state) %>%
mutate(ordr = mean(avg_victims, na.rm = T))%>%
filter(ordr!="NaN")%>%
ungroup()%>%
arrange(desc(ordr))%>%
ggplot(aes(x = reorder(state,ordr), y = avg_victims, color = year))+
geom_point(size = 0.6, position = "jitter")+
labs(x = "State", y = "Average number of victims per incident", title ="Victims per School Shooting")+
scale_color_viridis(option = "D")+
theme_bw()+
theme(axis.text.x = element_text(angle = 45, hjust = 1, size = 6))
media%>%
ggplot(aes(x = prev_month_art, y = mass_shooting, color = as.numeric(year)))+geom_point(position = "jitter")+
scale_color_continuous(name = "Year")+
labs(y = "Mass Shooting Count", x = "Articles in Previous Month")+
theme_bw()
media%>%
ggplot(aes(y = articles_shootings_excluding_firearm_laws_and_regulations, x= as.numeric(year), color = as.numeric(year)))+
geom_point()+
geom_jitter()+
labs(y = "Monthly Articles",x = "Year")+
theme(legend.position = "none")+
theme_bw()
media%>%
ggplot(aes(y = articles_shootings_excluding_firearm_laws_and_regulations/yearly_shootings, x= as.numeric(year), color = as.numeric(year)))+
geom_point()+
geom_jitter()+
labs(y = "Monthly Articles/Yearly Mass Shootings",x = "Year")+
theme(legend.position = "none")+
theme_bw()
media%>%
ggplot(aes(y = yearly_articles, x= as.numeric(year)))+
geom_point()+
labs(y = "Yearly Articles",x = "Year")+
theme_bw()
school_counts = ss%>%
select(school,city,state)%>%
unite(col = school_city,c(school,city,state), sep = "_")%>%
table(dnn = c("school","count"))%>%
data.frame()%>%
separate(col = school_city, into = c("school","city","state"), sep = "_" )
delta = data.frame(state = NULL, year = NULL, delta_gpa = NULL, delta_ss = NULL, delta_pop = NULL)
for(x in unique(ss_counts$state)){
dat = filter(ss_counts, state == x)%>%
arrange(year)
chng_gpa = c(NA,diff(dat$gpa))
chng_ss = c(NA, diff(dat$incident_count))
chng_pop = c(NA, diff(dat$population))
st = rep(x, nrow(dat))
newrows =data.frame(state = st, year = dat$year, delta_gpa = chng_gpa, delta_ss = chng_ss, delta_pop= chng_pop)
delta = bind_rows(delta,newrows)
}
ss_counts%>%
group_by(year)%>%
summarise(yearly_incidents = sum(incident_count), prev_year_art = prev_year_art[1] )%>%
ggplot(aes(x = year, y = yearly_incidents, fill = prev_year_art))+geom_bar(stat = "identity")+
labs(y = "Yearly School Shootings", x = "Year")+
scale_fill_viridis(option = "D", name = "Articles in previous year")+
theme_bw()
ggplot(data= bullying_survey%>%filter(year>=2013), aes(x = year, y = prop, grouby_by = state, color = variable))+
geom_line(position = "jitter")+
scale_color_viridis_d()+
theme_bw()
ggplot(data= bullying_survey%>%filter(variable %in% qns[9:17]), aes(x = year, y = prop, color = variable))+
geom_point(position = "jitter")+
geom_smooth(se=F)+
scale_color_viridis_d()+
theme_bw()
lab = data.frame(question = yrbss::yrbss_questions_binary$variable, label = yrbss::yrbss_questions_binary$label)%>%
filter(question %in% qns)
kableExtra::kable(lab, byrow = F, caption = "Questions topics")
| question | label |
|---|---|
| qn13 | Carried a weapon |
| qn14 | Carried a gun |
| qn15 | Carried a weapon on school property |
| qn16 | Did not go to school because they felt unsafe at school or on their way to or from school |
| qn17 | Were threatened or injured with a weapon on school property |
| qn18 | Were in a physical fight |
| qn19 | Were injured in a physical fight |
| qn20 | Were in a physical fight on school property |
| qn24 | Were bullied on school property |
| qn25 | Were electronically bullied |
| qn26 | Felt sad or hopeless |
| qn27 | Seriously considered attempting suicide |
| qn28 | Made a plan about how they would attempt suicide |
| qn29 | Attempted suicide |
| qn30 | Attempted suicide that resulted in an injury, poisoning, or overdose that had to be treated by a doctor or nurse |
| qnbullyweight | Been teased b/c of weight past 12 mos |
| qnbullygay | Been teased b/c labeled GLB past 12 mos |
bullying_survey%>%
slice(which(complete.cases(bullying_survey)))%>%
select(label)%>%
table()%>%
data.frame()%>%
kableExtra::kable(caption = "Observations per Question")
| . | Freq |
|---|---|
| Attempted suicide | 298 |
| Attempted suicide that resulted in an injury, poisoning, or overdose that had to be treated by a doctor or nurse | 282 |
| Been teased b/c labeled GLB past 12 mos | 18 |
| Been teased b/c of weight past 12 mos | 18 |
| Carried a gun | 231 |
| Carried a weapon | 275 |
| Carried a weapon on school property | 292 |
| Did not go to school because they felt unsafe at school or on their way to or from school | 298 |
| Felt sad or hopeless | 251 |
| Made a plan about how they would attempt suicide | 291 |
| Seriously considered attempting suicide | 308 |
| Were bullied on school property | 124 |
| Were electronically bullied | 94 |
| Were in a physical fight | 298 |
| Were in a physical fight on school property | 294 |
| Were injured in a physical fight | 270 |
| Were threatened or injured with a weapon on school property | 291 |
survey_combined%>%
gather(key = score_type, percent, c(3,5,7,9))%>%
ggplot(aes(x = state, y = percent, color = score_type ))+
geom_boxplot()+
scale_color_viridis_d()+
facet_wrap(.~score_type)+
theme_bw()+
theme(legend.position = "none")+
theme(axis.text.x = element_text(angle = 45, hjust = 1, size = 5))
fit_grade = glm(incident_count~gpa+poverty+bullying_score+ prev_year_art+year+suicide_score+fight_score+school_safety_score+bully_score+ offset(log(population)), family = "poisson", data = ss_counts)
summary(fit_grade)
##
## Call:
## glm(formula = incident_count ~ gpa + poverty + bullying_score +
## prev_year_art + year + suicide_score + fight_score + school_safety_score +
## bully_score + offset(log(population)), family = "poisson",
## data = ss_counts)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -3.0505 -0.8770 -0.4999 0.4891 2.2763
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -7.380e+01 1.163e+02 -0.635 0.526
## gpa -3.174e-01 6.986e-02 -4.543 5.54e-06 ***
## poverty -3.746e-02 2.989e-02 -1.253 0.210
## bullying_score 8.424e-02 5.126e-02 1.644 0.100
## prev_year_art 5.150e-04 3.362e-04 1.532 0.126
## year 2.840e-02 5.772e-02 0.492 0.623
## suicide_score 3.317e+00 4.734e+00 0.701 0.484
## fight_score -3.497e+00 2.584e+00 -1.353 0.176
## school_safety_score -3.752e+00 3.997e+00 -0.939 0.348
## bully_score 2.039e+00 3.411e+00 0.598 0.550
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for poisson family taken to be 1)
##
## Null deviance: 314.9 on 299 degrees of freedom
## Residual deviance: 266.2 on 290 degrees of freedom
## (2200 observations deleted due to missingness)
## AIC: 590.52
##
## Number of Fisher Scoring iterations: 5
fit_law = glm(reformulate(names(ss_counts)[c(23,25,26,28,30,32,34,36:39)],names(ss_counts)[3]), offset=log(population), family = "poisson", data = ss_counts)
summary(fit_law)
##
## Call:
## glm(formula = reformulate(names(ss_counts)[c(23, 25, 26, 28,
## 30, 32, 34, 36:39)], names(ss_counts)[3]), family = "poisson",
## data = ss_counts, offset = log(population))
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -2.5096 -0.8218 -0.5261 0.3507 3.0972
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -1.569e+01 1.092e+00 -14.364 < 2e-16 ***
## prev_year_art 5.241e-04 2.221e-04 2.359 0.01831 *
## poverty -2.733e-02 2.226e-02 -1.228 0.21942
## bullying_score 6.512e-02 4.202e-02 1.550 0.12125
## suicide_score 6.863e+00 4.031e+00 1.703 0.08863 .
## school_safety_score -4.545e+00 3.744e+00 -1.214 0.22469
## fight_score -5.625e+00 2.001e+00 -2.810 0.00495 **
## bully_score -2.167e+00 2.887e+00 -0.751 0.45291
## access_laws -7.169e-02 2.706e-02 -2.650 0.00806 **
## transport 8.523e-02 5.206e-02 1.637 0.10160
## screening 4.415e-03 2.902e-02 0.152 0.87908
## ban -2.192e-01 1.212e-01 -1.808 0.07054 .
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for poisson family taken to be 1)
##
## Null deviance: 560.79 on 549 degrees of freedom
## Residual deviance: 485.98 on 538 degrees of freedom
## (1950 observations deleted due to missingness)
## AIC: 966.4
##
## Number of Fisher Scoring iterations: 5
z_fit = pscl::zeroinfl(reformulate(names(ss_counts)[c(23,25,26,28,30,32,34,36:39)],names(ss_counts)[3]), offset=log(population), dist = "poisson", data = ss_counts)
summary(z_fit)
##
## Call:
## pscl::zeroinfl(formula = reformulate(names(ss_counts)[c(23, 25,
## 26, 28, 30, 32, 34, 36:39)], names(ss_counts)[3]), data = ss_counts,
## offset = log(population), dist = "poisson")
##
## Pearson residuals:
## Min 1Q Median 3Q Max
## -1.7744 -0.5866 -0.2949 0.2511 3.9460
##
## Count model coefficients (poisson with log link):
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -1.615e+01 1.124e+00 -14.370 < 2e-16 ***
## prev_year_art 1.657e-04 2.129e-04 0.778 0.43631
## poverty -5.031e-02 2.362e-02 -2.130 0.03318 *
## bullying_score 4.751e-02 4.317e-02 1.101 0.27105
## suicide_score 7.688e+00 4.212e+00 1.825 0.06795 .
## school_safety_score 2.312e+00 4.132e+00 0.559 0.57586
## fight_score -6.433e+00 2.017e+00 -3.189 0.00143 **
## bully_score 2.701e+00 3.122e+00 0.865 0.38687
## access_laws -7.835e-02 2.716e-02 -2.885 0.00392 **
## transport 1.149e-01 5.516e-02 2.083 0.03728 *
## screening -1.614e-02 2.988e-02 -0.540 0.58915
## ban -8.817e-02 1.226e-01 -0.719 0.47209
##
## Zero-inflation model coefficients (binomial with logit link):
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -23.87916 13.58751 -1.757 0.0788 .
## prev_year_art -0.02227 0.01072 -2.077 0.0378 *
## poverty -0.43610 0.29507 -1.478 0.1394
## bullying_score -0.41529 0.35063 -1.184 0.2363
## suicide_score 95.20197 55.58781 1.713 0.0868 .
## school_safety_score 92.93063 57.04815 1.629 0.1033
## fight_score -8.05032 32.49863 -0.248 0.8044
## bully_score 140.32318 72.99385 1.922 0.0546 .
## access_laws -0.47490 0.37900 -1.253 0.2102
## transport -1.24559 1.05967 -1.175 0.2398
## screening -1.44754 0.69171 -2.093 0.0364 *
## ban 4.11096 2.84593 1.445 0.1486
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Number of iterations in BFGS optimization: 197
## Log-likelihood: -454.3 on 24 Df
fit_law_victims = MASS::glm.nb(total_victims ~ prev_year_art + poverty + bullying_score + suicide_score + school_safety_score + fight_score + bully_score + access_laws +
transport + screening + ban+ offset(log(population)), data = ss_counts)
summary(fit_law_victims)
##
## Call:
## MASS::glm.nb(formula = total_victims ~ prev_year_art + poverty +
## bullying_score + suicide_score + school_safety_score + fight_score +
## bully_score + access_laws + transport + screening + ban +
## offset(log(population)), data = ss_counts, init.theta = 2.051095288,
## link = log)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -2.4073 -0.8123 -0.2356 0.3870 3.4578
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -1.716e+01 1.327e+00 -12.930 < 2e-16 ***
## prev_year_art -4.840e-05 2.873e-04 -0.168 0.866212
## poverty -8.310e-02 2.897e-02 -2.869 0.004122 **
## bullying_score 2.595e-02 5.099e-02 0.509 0.610811
## suicide_score 1.574e+01 5.004e+00 3.144 0.001664 **
## school_safety_score -7.495e+00 5.781e+00 -1.296 0.194824
## fight_score 6.526e+00 2.518e+00 2.592 0.009553 **
## bully_score 7.261e+00 3.837e+00 1.892 0.058450 .
## access_laws -1.142e-01 3.183e-02 -3.586 0.000336 ***
## transport 1.805e-01 7.041e-02 2.563 0.010382 *
## screening 7.615e-02 3.493e-02 2.180 0.029274 *
## ban -3.440e-01 1.505e-01 -2.285 0.022311 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for Negative Binomial(2.0511) family taken to be 1)
##
## Null deviance: 247.01 on 195 degrees of freedom
## Residual deviance: 195.70 on 184 degrees of freedom
## (2304 observations deleted due to missingness)
## AIC: 804.12
##
## Number of Fisher Scoring iterations: 1
##
##
## Theta: 2.051
## Std. Err.: 0.364
##
## 2 x log-likelihood: -778.117
1-pchisq(195.70 , df = 184)
## [1] 0.2636877
plot(DHARMa::simulateResiduals(fit_law))
if(FALSE){
ts_dat = ss_counts%>%
ungroup()%>%
select(-c(4:17,23:24,27))%>%
select(-starts_with("n"))%>%
filter(year>=2007)%>%
filter(year<2019)
ts_y = ts(ts_dat%>%ungroup()%>%select(incident_count))
ts_x = ts_dat%>%
select(c(6,9:18))%>%
mutate(population = log(population))%>%
as.matrix()
ts_fit = tsglm(ts_y, xreg = ts_x, model = list(past_obs = 2, external = c(FALSE,FALSE,FALSE,FALSE,FALSE,FALSE,FALSE,TRUE,FALSE,TRUE,TRUE)), link = "log", distr = "poisson")
best_fit = forecast::auto.arima(diff(ts_y), xreg = diff(ts_x))
astsa::sarima(diff(ts_y),p = 2, d = 0, q =2, xreg = diff(ts_x))
arima(diff(ts_y),order = c(2,0,2), xreg = diff(ts_x))$coef
sqrt(diag(vcov( arima(diff(ts_y),order = c(0,0,0), xreg = diff(ts_x)))))
cilb= arima(diff(ts_y),order = c(0,0,0), xreg = diff(ts_x))$coef-sqrt(diag(vcov( arima(diff(ts_y),order = c(0,0,0), xreg = diff(ts_x)))))
ciub = arima(diff(ts_y),order = c(0,0,0), xreg = diff(ts_x))$coef+sqrt(diag(vcov( arima(diff(ts_y),order = c(0,0,0), xreg = diff(ts_x)))))
data.frame(Lower= cilb, Upper = ciub)
}